In this exercise you will simulate the conditional probability that an athlete took drugs, given he/she failed a drug test.
Scenario: Suppose you have a population of 100 athletes. It is known that 10% of athletes take drugs. There is a test that will correctly identify 80% of drug users and also incorrectly identify 20% of clean athletes as drug users.
Simulate this situation: Each athlete in population of size 100 has probability 0.1 of being a drug taker. For each simulated athlete, generate a random integer 1-10 - if this is 1 the simulated athlete takes drugs, if this is 2-10 does not take drugs. 80% of the population is correctly identified by the test - for each simulated athlete, generate a second random integer 1-10 - if this is 1 or 2 the simulated athlete’s test is incorrect, otherwise correct.
Athletes who fail the test include drug takers correctly identified as cheats and clean athletes incorrectly identified as cheats. So a simulated athlete is deemed to have failed if his/her 1st random integer is 1 and 2nd random integer is 3-10 OR 1st integer 2-10 and 2nd integer 1-2.
You are now in a position to calculate the first simulated conditional probability of drug taking among the athletes who failed the test.
simulation <- function(
n = 100,
p_drug = 0.1,
p_true_pos = 0.8,
p_false_pos = 0.2,
two_tests = FALSE) {
# population of athletes
drug <- runif(n) < p_drug
# test results
test <- runif(n)
# simulate drug test
# if drug taker and correctly identified or clean and incorrectly identified
if (two_tests) {
second_test <- runif(n)
result <- (drug & test <= p_true_pos & second_test <= p_true_pos) |
(!drug & test <= p_false_pos & second_test <= p_false_pos)
} else {
result <- (drug & test <= p_true_pos) | (!drug & test <= p_false_pos)
}
# calculate conditional probability of drug taking given failed test
p_drug_fail <- sum(drug & result) / sum(result)
return(p_drug_fail)
}
simulation()
## [1] 0.32
plot_density <- function(simulations, main = "P(takes drugs | failed test)") {
hist(simulations,
main = main,
freq = FALSE,
xlab = "Probability",
col = "skyblue",
border = "white",
xlim = c(0, 1)
)
lines(density(simulations),
lwd = 2,
col = "red"
)
}
# repeat simulation 1000 times
simulations <- replicate(1000, simulation())
# plot histogram of simulated probabilities
plot_density(simulations)
# probability of drug taking in the population of athletes increased to 20%
p_drug_simulations <- replicate(1000, simulation(p_drug = 0.2))
# plot histogram of simulated probabilities
plot_density(p_drug_simulations,
main = "P(takes drugs | failed test) (20% drug takers)"
)
# proportion of drug takers correctly identified by the test increased to 90%
p_true_pos_simulations <- replicate(1000, simulation(p_true_pos = 0.9))
# plot histogram of simulated probabilities
plot_density(p_true_pos_simulations,
main = "P(takes drugs | failed test) (90% true positive rate)"
)
# false positive rate increased to 10%
p_false_pos_simulations <- replicate(1000, simulation(p_false_pos = 0.1))
# plot histogram of simulated probabilities
plot_density(p_false_pos_simulations,
main = "P(takes drugs | failed test) (10% false positive rate)"
)
# compare the simulations
par(mfrow = c(2, 2))
plot_density(simulations, main = "P(takes drugs | failed test)")
plot_density(p_drug_simulations,
main = "P(takes drugs | failed test) (20% drug takers)"
)
plot_density(p_true_pos_simulations,
main = "P(takes drugs | failed test) (90% true positive rate)"
)
plot_density(p_false_pos_simulations,
main = "P(takes drugs | failed test) (10% false positive rate)"
)
# repeat simulation 1000 times
simulations_two_tests <- replicate(1000, simulation(two_tests = TRUE))
# plot histogram of simulated probabilities
par(mfrow = c(1, 1))
plot_density(simulations_two_tests,
main = "P(takes drugs | failed test) (two tests)"
)
# Bayes' Theorem
# P(takes drugs | failed test) = P(takes drugs and failed test) / P(failed test)
p_drug <- 0.1
p_true_pos <- 0.8
p_false_pos <- 0.2
print(paste(
"P(takes drugs | failed test): ",
p_drug * p_true_pos / (p_drug * p_true_pos + (1 - p_drug) * p_false_pos)
))
## [1] "P(takes drugs | failed test): 0.307692307692308"
Initially choose two distributions (one discrete and one continuous).
Selected: Poisson and Gamma distributions.
points <- seq(0, 100, by = 1)
pdfs <- list()
cdfs <- list()
# Binomial distribution
N <- 100
p <- 0.25
binomial_pmf <- dbinom(points, size = N, prob = p)
pdfs$Binomial <- binomial_pmf
binomial_cdf <- pbinom(points, size = N, prob = p)
cdfs$Binomial <- binomial_cdf
# Geometric distribution
p <- 0.25
geometric_pmf <- dgeom(points, prob = p)
pdfs$Geometric <- geometric_pmf
geometric_cdf <- pgeom(points, prob = p)
cdfs$Geometric <- geometric_cdf
# Poisson distribution
lambda <- 25
poisson_pmf <- dpois(points, lambda = lambda)
pdfs$Poisson <- poisson_pmf
poisson_cdf <- ppois(points, lambda = lambda)
cdfs$Poisson <- poisson_cdf
# Continuous uniform distribution
uniform_pdf <- dunif(points, min = 0, max = 1)
pdfs$Uniform <- uniform_pdf
uniform_cdf <- punif(points, min = 0, max = 1)
cdfs$Uniform <- uniform_cdf
# Exponential distribution
rate <- 1
exponential_pdf <- dexp(points, rate = rate)
pdfs$Exponential <- exponential_pdf
exponential_cdf <- pexp(points, rate = rate)
cdfs$Exponential <- exponential_cdf
# Normal distribution
mean <- 25
sd <- 5
normal_pdf <- dnorm(points, mean = mean, sd = sd)
pdfs$Normal <- normal_pdf
normal_cdf <- pnorm(points, mean = mean, sd = sd)
cdfs$Normal <- normal_cdf
# Student's t distribution
df <- 25
t_pdf <- dt(points, df = df)
pdfs$Student <- t_pdf
t_cdf <- pt(points, df = df)
cdfs$Student <- t_cdf
# Gamma distribution
shape <- 25
scale <- 1
gamma_pdf <- dgamma(points, shape = shape, scale = scale)
pdfs$Gamma <- gamma_pdf
gamma_cdf <- pgamma(points, shape = shape, scale = scale)
cdfs$Gamma <- gamma_cdf
plot_pdf_cdf <- function(points, pdfs, cdfs) {
n <- length(pdfs)
col_pal <- rainbow(n)
# par(mfcol = c(2 * ceiling(sqrt(n)), ceiling(sqrt(n))))
par(mfrow = c(1, 2))
for (i in 1:n) {
plot(points, pdfs[[i]],
lwd = 2,
type = "h",
xlab = "x",
ylab = "Density",
main = paste(names(pdfs)[i], "PDF"),
col = col_pal[i]
)
plot(points, cdfs[[i]],
lwd = 2,
type = "h",
xlab = "x",
ylab = "Cumulative density",
main = paste(names(cdfs)[i], "CDF"),
col = col_pal[i]
)
}
}
plot_pdf_cdf(points, pdfs, cdfs)
n <- 100
generate_samples <- function(n) {
samples <- list()
# Binomial distribution
samples$Binomial <- rbinom(n, size = N, prob = p)
# Geometric distribution
samples$Geometric <- rgeom(n, prob = p)
# Poisson distribution
samples$Poisson <- rpois(n, lambda = lambda)
# Continuous uniform distribution
samples$Uniform <- runif(n, min = 0, max = 1)
# Exponential distribution
samples$Exponential <- rexp(n, rate = rate)
# Normal distribution
samples$Normal <- rnorm(n, mean = mean, sd = sd)
# Student's t distribution
samples$Student <- rt(n, df = df)
# Gamma distribution
samples$Gamma <- rgamma(n, shape = shape, scale = scale)
return(samples)
}
samples <- generate_samples(n)
Plot histograms of the relative frequency distribution of your random sample using appropriate bin widths.
Overlay the true probability density function. Do they match?
plot_distributions <- function(points, samples, pdfs) {
n <- length(samples)
col_pal <- rainbow(n)
# par(mfrow = c(ceiling(sqrt(n / 2)), 2 * ceiling(sqrt(n / 2))))
par(mfrow = c(1, 1))
for (i in 1:n) {
m <- length(samples[[i]])
hist(samples[[i]],
freq = FALSE,
xlab = "x",
ylab = "Density",
main = paste(names(samples)[i], " distribution (n = ", m, ")", sep = ""),
col = col_pal[i],
border = "white"
)
lines(points, pdfs[[i]],
lwd = 2,
col = "red"
)
}
}
plot_distributions(points, samples, pdfs)
print_mean_var <- function(name, mean, var) {
print(paste(name, "distribution: mean =", mean))
print(paste(name, "distribution: variance =", var))
}
# Binomial distribution
print_mean_var("Binomial", N * p, N * p * (1 - p))
## [1] "Binomial distribution: mean = 25"
## [1] "Binomial distribution: variance = 18.75"
# Geometric distribution
print_mean_var("Geometric", 1 / p, (1 - p) / p^2)
## [1] "Geometric distribution: mean = 4"
## [1] "Geometric distribution: variance = 12"
# Poisson distribution
print_mean_var("Poisson", lambda, lambda)
## [1] "Poisson distribution: mean = 25"
## [1] "Poisson distribution: variance = 25"
# Continuous uniform distribution
print_mean_var("Uniform", 0.5, 1 / 12)
## [1] "Uniform distribution: mean = 0.5"
## [1] "Uniform distribution: variance = 0.0833333333333333"
# Exponential distribution
print_mean_var("Exponential", 1 / rate, 1 / rate^2)
## [1] "Exponential distribution: mean = 1"
## [1] "Exponential distribution: variance = 1"
# Normal distribution
print_mean_var("Normal", mean, sd^2)
## [1] "Normal distribution: mean = 25"
## [1] "Normal distribution: variance = 25"
# Student's t distribution
print_mean_var("Student", 0, df / (df - 2))
## [1] "Student distribution: mean = 0"
## [1] "Student distribution: variance = 1.08695652173913"
# Gamma distribution
print_mean_var("Gamma", shape * scale, shape * scale^2)
## [1] "Gamma distribution: mean = 25"
## [1] "Gamma distribution: variance = 25"
for (i in seq_along(samples)) {
print_mean_var(names(samples)[i], mean(samples[[i]]), var(samples[[i]]))
}
## [1] "Binomial distribution: mean = 25.26"
## [1] "Binomial distribution: variance = 21.1640404040404"
## [1] "Geometric distribution: mean = 2.53"
## [1] "Geometric distribution: variance = 9.03949494949495"
## [1] "Poisson distribution: mean = 25.41"
## [1] "Poisson distribution: variance = 25.4766666666667"
## [1] "Uniform distribution: mean = 0.453228120745625"
## [1] "Uniform distribution: variance = 0.0795157551266729"
## [1] "Exponential distribution: mean = 1.02783821715451"
## [1] "Exponential distribution: variance = 0.950414576846789"
## [1] "Normal distribution: mean = 24.3083435118665"
## [1] "Normal distribution: variance = 22.5447103241993"
## [1] "Student distribution: mean = -0.123727317840974"
## [1] "Student distribution: variance = 1.23263373148248"
## [1] "Gamma distribution: mean = 24.8788326577764"
## [1] "Gamma distribution: variance = 30.0248791843826"
n_values <- c(250, 500, 1000)
for (n in n_values) {
samples <- generate_samples(n)
plot_distributions(points, samples, pdfs)
for (i in seq_along(samples)) {
print_mean_var(names(samples)[i], mean(samples[[i]]), var(samples[[i]]))
}
}
## [1] "Binomial distribution: mean = 25.392"
## [1] "Binomial distribution: variance = 18.3437108433735"
## [1] "Geometric distribution: mean = 3.14"
## [1] "Geometric distribution: variance = 13.4060240963855"
## [1] "Poisson distribution: mean = 25.14"
## [1] "Poisson distribution: variance = 24.9803212851406"
## [1] "Uniform distribution: mean = 0.501526791631244"
## [1] "Uniform distribution: variance = 0.0816809141374117"
## [1] "Exponential distribution: mean = 1.03654288699367"
## [1] "Exponential distribution: variance = 1.11857353139697"
## [1] "Normal distribution: mean = 25.1669435976267"
## [1] "Normal distribution: variance = 27.6196038137966"
## [1] "Student distribution: mean = -0.0272425654411244"
## [1] "Student distribution: variance = 1.11500048190876"
## [1] "Gamma distribution: mean = 25.0679681233418"
## [1] "Gamma distribution: variance = 21.5183408284542"
## [1] "Binomial distribution: mean = 25.222"
## [1] "Binomial distribution: variance = 19.1630420841683"
## [1] "Geometric distribution: mean = 2.888"
## [1] "Geometric distribution: variance = 11.8832224448898"
## [1] "Poisson distribution: mean = 25"
## [1] "Poisson distribution: variance = 26.565130260521"
## [1] "Uniform distribution: mean = 0.489496289813425"
## [1] "Uniform distribution: variance = 0.0888922311995496"
## [1] "Exponential distribution: mean = 0.966758875139702"
## [1] "Exponential distribution: variance = 0.856422618745479"
## [1] "Normal distribution: mean = 24.9949648184796"
## [1] "Normal distribution: variance = 21.5736614714004"
## [1] "Student distribution: mean = -0.039675728457813"
## [1] "Student distribution: variance = 1.012270570921"
## [1] "Gamma distribution: mean = 25.1327327266588"
## [1] "Gamma distribution: variance = 23.7069820629858"
## [1] "Binomial distribution: mean = 25.02"
## [1] "Binomial distribution: variance = 18.5321321321321"
## [1] "Geometric distribution: mean = 3.144"
## [1] "Geometric distribution: variance = 13.1464104104104"
## [1] "Poisson distribution: mean = 24.971"
## [1] "Poisson distribution: variance = 23.9541131131131"
## [1] "Uniform distribution: mean = 0.501945010180119"
## [1] "Uniform distribution: variance = 0.0811192702888404"
## [1] "Exponential distribution: mean = 0.97887561878205"
## [1] "Exponential distribution: variance = 0.931017999051545"
## [1] "Normal distribution: mean = 24.9588078836275"
## [1] "Normal distribution: variance = 24.0557148069005"
## [1] "Student distribution: mean = 0.0116912960498983"
## [1] "Student distribution: variance = 1.00581697897865"
## [1] "Gamma distribution: mean = 25.1858277192436"
## [1] "Gamma distribution: variance = 28.92632956119"